%config InlineBackend.figure_formats = ["retina"]
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.colors
import seaborn as sns
from more_itertools import unique_everseen
import scanpy as sc
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.n_jobs = 16
import anndata2ri
anndata2ri.activate()
%reload_ext rpy2.ipython
plt.rcParams.update({"font.family":"Reem Kufi"})
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
# Custom heatmap colors
heatcolors = matplotlib.colors.LinearSegmentedColormap.from_list("", ["gray", "#000004FF", "#330A5FFF", "#781C6DFF",
"#BB3754FF", "#ED6925FF",
"#FCB519FF", "#FCFFA4FF"])
heatcolors_wr = matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "#FFF5F0", "#FEE0D2", "#FCBBA1",
"#FC9272", "#FB6A4A", "#EF3B2C",
"#CB181D", "#A50F15", "#67000D"])
sc.settings.set_figure_params(dpi=100, dpi_save=300, format="pdf", frameon=True)
sns.set_context("paper", font_scale=1, rc={"font.size":8,"axes.labelsize":8,"axes.titlesize":14})
%%R
# Load the libraries
suppressWarnings(suppressMessages(require(scran)))
suppressWarnings(suppressMessages(require(scater)))
suppressWarnings(suppressMessages(library(DropletUtils)))
suppressPackageStartupMessages(library(batchelor))
suppressPackageStartupMessages(library(ComplexHeatmap))
suppressPackageStartupMessages(library(circlize))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyverse))
options(mc.cores = 24L)
%%R
# Load the individual data
MUX8555 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Macaque_Spermatogenesis/MacFas1_count/outs/filtered_feature_bc_matrix/")
colData(MUX8555)$Sample <- rep("MUX8555", ncol(MUX8555))
MUX8990 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Macaque_Spermatogenesis/MacFas2_count/outs/filtered_feature_bc_matrix/")
colData(MUX8990)$Sample <- rep("MUX8990", ncol(MUX8990))
MUX8991 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Macaque_Spermatogenesis/MacFas3_count/outs/filtered_feature_bc_matrix/")
colData(MUX8991)$Sample <- rep("MUX8991", ncol(MUX8991))
MUX9780 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Macaque_Spermatogenesis/MacFas4_count/outs/filtered_feature_bc_matrix/")
colData(MUX9780)$Sample <- rep("MUX9780", ncol(MUX9780))
SSEA4 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Macaque_Spermatogenesis/MacFas4_SSEA4_count/outs/filtered_feature_bc_matrix/")
colData(SSEA4)$Sample <- rep("SSEA4", ncol(SSEA4))
%%R
bcrank <- barcodeRanks(counts(MUX8555))
# Knee-point plots on the raw data
uniq <- !duplicated(bcrank$rank)
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
xlab="Rank", ylab="Total UMI count", cex.lab=1.2)
abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2)
legend("bottomleft", legend=c("Knee"),
col=c("dodgerblue"), lty=2, cex=1.2)
%%R
bcrank <- barcodeRanks(counts(MUX8990))
# Only showing unique points for plotting speed.
uniq <- !duplicated(bcrank$rank)
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
xlab="Rank", ylab="Total UMI count", cex.lab=1.2)
abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2)
legend("bottomleft", legend=c("Knee"),
col=c("dodgerblue"), lty=2, cex=1.2)
%%R
bcrank <- barcodeRanks(counts(MUX8991))
# Only showing unique points for plotting speed.
uniq <- !duplicated(bcrank$rank)
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
xlab="Rank", ylab="Total UMI count", cex.lab=1.2)
abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2)
legend("bottomleft", legend=c("Knee"),
col=c("dodgerblue"), lty=2, cex=1.2)
%%R
bcrank <- barcodeRanks(counts(MUX9780))
# Only showing unique points for plotting speed.
uniq <- !duplicated(bcrank$rank)
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
xlab="Rank", ylab="Total UMI count", cex.lab=1.2)
abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2)
legend("bottomleft", legend=c("Knee"),
col=c("dodgerblue"), lty=2, cex=1.2)
%%R
bcrank <- barcodeRanks(counts(SSEA4))
# Only showing unique points for plotting speed.
uniq <- !duplicated(bcrank$rank)
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
xlab="Rank", ylab="Total UMI count", cex.lab=1.2)
abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2)
legend("bottomleft", legend=c("Knee"),
col=c("dodgerblue"), lty=2, cex=1.2)
%%R
# Merge all datasets
mf_sce <- cbind(MUX8555, MUX8990, MUX8991, MUX9780, SSEA4)
rownames(mf_sce) <- rowData(mf_sce)$Symbol
print(mf_sce)
%%R
bcrank <- barcodeRanks(counts(mf_sce))
# Only showing unique points for plotting speed.
uniq <- !duplicated(bcrank$rank)
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
xlab="Rank", ylab="Total UMI count", cex.lab=1.2)
abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2)
legend("bottomleft", legend=c("Knee"),
col=c("dodgerblue"), lty=2, cex=1.2)
%%R
MT.Genes <- c("ATP6", "ATP8", "COX1", "COX2", "COX3", "CYTB", "ND1", "ND2", "ND3", "ND4", "ND4L", "ND5", "ND6")
# Calculate QC metrics
mf_sce <- addPerCellQC(mf_sce, subsets = list(mt=MT.Genes))
# Remove low quality cells i.e., those with less than 200 genes expressed and less than 3500 UMI counts
mf_sce <- mf_sce[,mf_sce$detected > 200 & mf_sce$sum > 3500 & mf_sce$sum < 100000 & mf_sce$subsets_mt_percent < 20]
# Remove unxpressed genes
mf_sce <- mf_sce[Matrix::rowSums(counts(mf_sce)) > 0, ]
%%R
# Post mitochondrial content and UMI based filtering
print(mf_sce)
%%R
# Normalization
set.seed(55063)
clusters <- quickCluster(mf_sce)
mf_sce <- computeSumFactors(mf_sce, cluster = clusters)
mf_sce <- logNormCounts(mf_sce)
# Identify HVGs
set.seed(55063)
mod_mf <- modelGeneVar(mf_sce)
hvgs_mf <- getTopHVGs(mod_mf, prop = 0.1)
# Dimensionality reduction
set.seed(55063)
mf_sce <- runPCA(mf_sce, subset_row = hvgs_mf)
mf_sce <- runTSNE(mf_sce, dimred = "PCA", num_threads = 24, verbose = F)
%%R
# Perform Batch correction
# Identify HVGs accounting for different samples
set.seed(55063)
mod_mf_batch <- modelGeneVar(mf_sce, block = colData(mf_sce)$Sample)
hvgs_mf_batch <- getTopHVGs(mod_mf_batch, prop = 0.1)
set.seed(55063)
mf_sce_corrected <- fastMNN(mf_sce, subset.row = hvgs_mf_batch, batch=mf_sce$Sample)
set.seed(55063)
mf_sce_corrected <- runTSNE(mf_sce_corrected, dimred = "corrected", perplexity=300, num_threads = 12, verbose = F)
reducedDim(mf_sce, "mnn") <- reducedDim(mf_sce_corrected, "corrected")
reducedDim(mf_sce, "mnnTSNE") <- reducedDim(mf_sce_corrected, "TSNE")
%%R -o mf_sce
# Number of HVGs used for batch correction
length(hvgs_mf_batch)
mf_sce.var_names_make_unique()
mf_sce.obs['Library'] = mf_sce.obs['Sample']
mf_sce.obs['batch'] = mf_sce.obs['Library'].replace({
"MUX8555":"-0",
"MUX8990": "-1",
"MUX8991": "-2",
"MUX9780": "-3",
"SSEA4": "-4"
})
mf_sce.obs['Sample'] = mf_sce.obs['Library'].replace({
"MUX8555":"Adult1",
"MUX8990": "Infant",
"MUX8991": "Juvenile",
"MUX9780": "Adult2",
"SSEA4": "Adult2_SSEA4+"
})
mf_sce.obs.index = mf_sce.obs['Barcode']+mf_sce.obs['batch']
mf_sce.obs['Sample'].value_counts() # Number of cells in each animal
mf_sce.obs['sum'].mean() # Mean of UMI counts across all cells
mf_sce.obs['sum'].describe() # Descriptive statistics of UMI counts across all cells
mf_sce.obs['detected'].mean() # Mean of genes detected across all cells
mf_sce.obs['detected'].describe() # Descriptive statistics of genes detected across all cells
mf_sce.obs["Sample"] = mf_sce.obs["Sample"].astype("category").cat.reorder_categories(["Infant", "Juvenile", "Adult1", "Adult2", "Adult2_SSEA4+"],ordered=True)
sc.pl.tsne(mf_sce, color = ["Sample"], palette=["royalblue", "forestgreen", "red", "darkorange", "skyblue"],
size = 10,
edgecolor = "black",
linewidths = 0.1,
ncols = 2,
legend_fontsize = 10,legend_loc="right margin", title="Before batch correction")
mf_sce.obsm['X_tsne'] = mf_sce.obsm['mnnTSNE']
sc.pl.tsne(mf_sce, color = ["sum"],
size = 10,
edgecolor = "black", cmap = "viridis",
linewidths = 0.1,
ncols = 2,
legend_fontsize = 10, legend_loc="right margin", title="No. of UMIs")
sc.pl.tsne(mf_sce, color = ["Sample"],
size = 10,
edgecolor = "black",
linewidths = 0.1,
ncols = 2,
legend_fontsize = 10, legend_loc="right margin", title="After batch correction")
for s in ["Infant", "Juvenile", "Adult1", "Adult2", "Adult2_SSEA4+"]:
sc.pl.tsne(mf_sce, color=[ "Sample"], palette=["royalblue", "forestgreen", "red", "darkorange", "skyblue"],
groups=s,size=10, edgecolor="black", linewidths=0.05, ncols=1, legend_fontsize=10, cmap="viridis",
legend_fontoutline=1, title=s)
mf_sce.obsm["X_pca"] = mf_sce.obsm["mnn"]
sc.pp.neighbors(mf_sce, n_neighbors = 5)
sc.tl.leiden(mf_sce, resolution = 2, key_added = 'leiden_2')
sc.pl.tsne(mf_sce, color=["leiden_2"], size=10, legend_loc = "on data", edgecolor = "black", linewidths = 0.1, ncols = 3,
legend_fontsize = 8, alpha=1, legend_fontoutline=1,
title="Leiden clustering: resolution of 2 with n_neighbors of 5")
wv = sc.pl.stacked_violin(mf_sce, var_names=["DDX4", "ENSMFAG00000037727", "ID4", "FGFR3", "SOHLH1", "UCHL1", "REC8",
"STRA8", "SYCP3", "AURKA","ACRV1", "SPACA1", "PRM2","OAZ3", "SPEM1", "VIM",
"CD74", "VWF", "ACTA2", "TAGLN", "CLU", "AMH", "SOX9","INSL3", "INHBA", "MYH11",
"MYH1", "CYP11A", "ID2", "RORA", "IGF1", "IGF1", "IGFBP5", "IGFBP5", "MYL9", "TPM2"],
groupby="leiden_2", swap_axes=True, layer="logcounts")
mf_sce.obs['broadcluster'] = mf_sce.obs['leiden_2']
mf_sce.obs['broadcluster'] = mf_sce.obs['broadcluster'].astype('str')
mf_sce.obs['broadcluster'][mf_sce.obs['broadcluster'].isin(["0", "3", "5", "39", "41", "42"])]="Somatic"
mf_sce.obs['broadcluster'][mf_sce.obs['broadcluster'].isin(["31", "37", "40", "44", "45", "46", "47", "48", "49", "50", "51", "52", "53"])] = "Outliers"
mf_sce.obs['broadcluster'][~mf_sce.obs['broadcluster'].isin(["Outliers", "Somatic"])] = "Germ"
sc.pl.tsne(mf_sce, color=["broadcluster"],
size=10,
edgecolor = "black",
linewidths = 0.1,
legend_fontsize = 10,
alpha=1, title="")
mf_leiden_clusters = mf_sce.obs['leiden_2'] # Save the cluster info to a variable and export it to R #do not re-run after clusters are called -> i.e. for the next step to be stable
%%R -i mf_leiden_clusters
mf_sce$mf_leiden_clusters = mf_leiden_clusters
mf_sce$broadcluster <- as.character(mf_sce$mf_leiden_clusters)
%%R
# Based on marker gene expression, assign identity to the cells i.e., Germ, somatic, and outliers
mf_sce$broadcluster[mf_sce$broadcluster %in% c("0", "3", "5", "39", "41", "42")] = "Somatic"
mf_sce$broadcluster[mf_sce$broadcluster %in% c("31", "37", "40", "44", "45", "46", "47", "48", "49", "50", "51", "52", "53")] = "Outliers"
mf_sce$broadcluster[!mf_sce$broadcluster %in% c("Outliers", "Somatic")] = "Germ"
%%R -o mf_sce_gs
# Exclude the Outlier cells and perform batch correction
mf_sce_gs <- mf_sce[,mf_sce$broadcluster!="Outliers"]
mf_sce_gs <- mf_sce_gs[Matrix::rowSums(counts(mf_sce_gs)) > 0, ]
mf_sce_gs$mf_leiden_clusters <- factor(mf_sce_gs$mf_leiden_clusters)
# Identify HVGs based on sample
set.seed(55063)
mod_mf_batch_gs <- modelGeneVar(mf_sce_gs, block=colData(mf_sce_gs)$Sample)
hvgs_mf_batch_gs <- getTopHVGs(mod_mf_batch_gs, prop=0.1)
print(paste("HVGs used:",length(hvgs_mf_batch_gs)))
set.seed(55063)
mf_sce_gs_corrected <- fastMNN(mf_sce_gs, subset.row=hvgs_mf_batch_gs, batch=mf_sce_gs$Sample)
reducedDim(mf_sce_gs, "mnn") <- reducedDim(mf_sce_gs_corrected, "corrected")
mf_sce_gs$mf_leiden_clusters <- as.character(mf_sce_gs$mf_leiden_clusters)
mf_sce_gs.var_names_make_unique()
mf_sce_gs.obs['Library'] = mf_sce_gs.obs['Sample']
mf_sce_gs.obs['batch'] = mf_sce_gs.obs['Library'].replace({
"MUX8555":"-0",
"MUX8990": "-1",
"MUX8991": "-2",
"MUX9780": "-3",
"SSEA4": "-4"
})
mf_sce_gs.obs['Sample'] = mf_sce_gs.obs['Library'].replace({
"MUX8555":"Adult1",
"MUX8990": "Infant",
"MUX8991": "Juvenile",
"MUX9780": "Adult2",
"SSEA4": "Adult2_SSEA4+"
})
mf_sce_gs.obs.index = mf_sce_gs.obs['Barcode']+mf_sce_gs.obs['batch']
mf_sce_gs.obs["Sample"] = mf_sce_gs.obs["Sample"].astype("category").cat.reorder_categories(["Infant", "Juvenile", "Adult1", "Adult2", "Adult2_SSEA4+"],ordered=True)
mf_sce_gs.obs['sum'].describe() #Descriptive statistics of UMI counts across all cells excluding outliers
mf_sce_gs.obs['detected'].describe() #Descriptive statistics of genes detected across all cells excluding outliers
mf_sce_gs.obsm['X_tsne']=mf_sce_gs.obsm['mnnTSNE']
mf_sce_gs.obsm['X_pca']=mf_sce_gs.obsm['mnn']
sc.pl.tsne(mf_sce_gs, color=["broadcluster"], size=8,
legend_loc="on data", edgecolor="black", linewidths=0.1, ncols=3, legend_fontsize=6, alpha=0.5, legend_fontoutline=1)
sc.tl.dendrogram(mf_sce_gs, groupby="mf_leiden_clusters")
sc.pl.correlation_matrix(mf_sce_gs, groupby="mf_leiden_clusters", figsize=(10,7))
sc.pl.stacked_violin(mf_sce_gs, groupby="mf_leiden_clusters",
var_names=["ACTA2", "TAGLN", "INHBA","CYP11A", "INSL3", "AMH",
"CLU", "CD74", "VWF", "ID4", "ENSMFAG00000037727", "UCHL1", "SOHLH1", "STRA8",
"SYCP3", "PIWIL1", "AURKA", "ACRV1", "NME5", "AKAP14", "TNP2", "PRM2", "OAZ3", "CRISP2", "SPEM1"],
swap_axes=True, layer="logcounts");
sc.pl.tsne(mf_sce_gs, color=["ACTA2", "TAGLN", "INHBA","CYP11A", "INSL3", "AMH", "CLU", "CD74", "VWF",
"ID4", "UCHL1", "SOHLH1", "STRA8", "SYCP3", "PIWIL1", "AURKA", "ACRV1", "NME5",
"AKAP14", "TNP2", "PRM2", "OAZ3", "CRISP2", "SPEM1"],
legend_fontsize=8, color_map=heatcolors,
size=20, edgecolor="black", linewidths=0.1, ncols=4, layer="logcounts", wspace=0.1, hspace=0.2)
gsconditions = [
(mf_sce_gs.obs["mf_leiden_clusters"].isin(["0"])),#Myoid
(mf_sce_gs.obs["mf_leiden_clusters"].isin(["3"])),#Immature Leydig
(mf_sce_gs.obs["mf_leiden_clusters"].isin(["42"])),#Leydig
(mf_sce_gs.obs["mf_leiden_clusters"].isin(["39"])),#Sertoli
(mf_sce_gs.obs["mf_leiden_clusters"].isin(["41"])),#Macrophage
(mf_sce_gs.obs["mf_leiden_clusters"].isin(["5"])),#Endothelial
(mf_sce_gs.obs["mf_leiden_clusters"].isin(["43", "9", "4", "28", "2", "34"])),#Spermatogonia
(mf_sce_gs.obs["mf_leiden_clusters"].isin(["35", "27", "36", "33", "17","8", "16",
"25", "1", "24", "15", "21", "10", "11", "26"])),#Spermatocytes
(mf_sce_gs.obs["mf_leiden_clusters"].isin(["13","12", "22", "20", "23", "32", "29"])),#Round spermatids
(mf_sce_gs.obs["mf_leiden_clusters"].isin(["38", "14", "18", "7", "30", "6"])),#Elongating Spermatids
(mf_sce_gs.obs["mf_leiden_clusters"].isin(["19"]))]#Sperm
gsgroups = ["Myoid", "Immature Leydig", "Leydig", "Sertoli", "Macrophage", "Endothelial",
"Spermatogonia", "Spermatocytes","Round spermatids","Elongating spermatids","Immature Sperm"]
gs_clusters = np.select(gsconditions, gsgroups, default="Germ")
mf_sce_gs.obs["broad_clusters"] = gs_clusters
mf_sce_gs.obs["broad_clusters"] = mf_sce_gs.obs["broad_clusters"].astype("category").cat.reorder_categories(["Myoid", "Immature Leydig", "Leydig", "Sertoli", "Macrophage", "Endothelial",
"Spermatogonia", "Spermatocytes", "Round spermatids","Elongating spermatids","Immature Sperm"],ordered=True)
mf_sce_gs.obs['broad_clusters_short'] = mf_sce_gs.obs['broad_clusters']
mf_sce_gs.obs['broad_clusters_short'] = mf_sce_gs.obs['broad_clusters_short'].replace({
"Round spermatids":"Round STids",
"Elongating spermatids": "Elongating STids"
})
mf_sce_gs.obs['mf_leiden_clusters_celltype'] = mf_sce_gs.obs['mf_leiden_clusters'].astype(str) +"-"+mf_sce_gs.obs['broad_clusters_short'].astype(str)
mf_sce_gs.obs['mf_broadclusters_celltype_Sample'] = mf_sce_gs.obs['broad_clusters_short'].astype(str)+"-"+mf_sce_gs.obs['Sample'].astype(str)
cols_clusters = ["#FFB292","gold", "lightslategray", "#936C00","lime", "red", "#F16913", "#4C7C5E", "#2171B5","#eaa9bd", "#91357d"]
sc.pl.tsne(mf_sce_gs, color=["broad_clusters"], legend_fontsize=10, palette=cols_clusters,
size=20, edgecolor="black", linewidths=0.1, title="")
sc.tl.dendrogram(mf_sce_gs, groupby="mf_leiden_clusters_celltype")
sc.pl.correlation_matrix(mf_sce_gs, groupby="mf_leiden_clusters_celltype", figsize=(10,6), dendrogram=True)
sc.tl.dendrogram(mf_sce_gs, groupby="mf_broadclusters_celltype_Sample")
sc.pl.correlation_matrix(mf_sce_gs, groupby="mf_broadclusters_celltype_Sample", figsize=(12,8), dendrogram=True)
sc.tl.rank_genes_groups(mf_sce_gs, "broad_clusters", n_genes = 6000, method = "wilcoxon", layer = "logcounts", use_raw = False)
gslist = {}
for i in mf_sce_gs.obs["broad_clusters"].cat.categories:
genes = sc.get.rank_genes_groups_df(mf_sce_gs, group=i,pval_cutoff=0.01, log2fc_min=1.25,
key="rank_genes_groups").sort_values(by=["logfoldchanges"], ascending=[False]).dropna(axis=0)["names"].to_list()
gslist[i] = genes
for key, value in gslist.items():
#print value
print(key, len([item for item in value if item]))
gs_L1 = []
for i in mf_sce_gs.obs["broad_clusters"].cat.categories:
genes = sc.get.rank_genes_groups_df(mf_sce_gs, group=i,pval_cutoff=0.01,
log2fc_min=1.25, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
gs_L1.extend(genes)
gs_unique_genes = list(unique_everseen(gs_L1))
len(gs_unique_genes)
ax, mean = sc.pl.matrixplot(mf_sce_gs, gs_unique_genes, groupby="broad_clusters", figsize=(3, 4),
standard_scale="var", linewidth=.000001,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show=False,
show_gene_labels=False)
pd.crosstab(mf_sce_gs.obs['Sample'], mf_sce_gs.obs['broad_clusters'])
cellcounts = pd.crosstab( mf_sce_gs.obs['Sample'], mf_sce_gs.obs['broad_clusters'])
def reset_index(df):
'''Returns DataFrame with index as columns'''
index_df = df.index.to_frame(index=False)
df = df.reset_index(drop=True)
return pd.merge(index_df, df, left_index=True, right_index=True)
cellcounts = reset_index(cellcounts)
germ_cellcounts = cellcounts[['Sample', 'Spermatogonia','Spermatocytes', 'Round spermatids', 'Elongating spermatids','Immature Sperm']]
somatic_cellcounts = cellcounts[['Sample', 'Myoid', 'Immature Leydig', 'Leydig', 'Sertoli','Macrophage', 'Endothelial']]
germ_cellcounts.set_index('Sample', inplace=True)
## Draw pie chart to show cell count distribution
germ_cellcounts = germ_cellcounts.div(germ_cellcounts.sum(axis=1), axis=0)
fig, axs = plt.subplots(ncols=germ_cellcounts.index.size, nrows=1, figsize=(7,2))
fig.subplots_adjust(hspace=0.5, wspace=0.05)
#l=[]
for row in range(germ_cellcounts.index.size):
fig.add_subplot(axs[row] )
plt.pie(germ_cellcounts.loc[germ_cellcounts.index[row],:],labels=None, colors=["#F16913", "#4C7C5E", "#2171B5","#eaa9bd", "#91357d"])
plt.title(germ_cellcounts.index[row], fontsize=12)
#plt.axis('equal')
plt.tight_layout()
plt.axis('off')
fig.legend(germ_cellcounts.columns, ncol=1,loc='upper left', bbox_to_anchor=(1, 0.7), fontsize=10);
somatic_cellcounts.set_index('Sample', inplace=True)
somatic_cellcounts = somatic_cellcounts.div(somatic_cellcounts.sum(axis=1), axis=0)
fig, axs = plt.subplots(ncols=somatic_cellcounts.index.size, nrows=1, figsize=(7,2))
fig.subplots_adjust(hspace=0.5, wspace=0.05)
#l=[]
for row in range(somatic_cellcounts.index.size):
fig.add_subplot(axs[row] )
plt.pie(somatic_cellcounts.loc[somatic_cellcounts.index[row],:],labels=None, colors=["#FFB292","gold", "lightslategray", "#936C00","lime", "red"])
plt.title(somatic_cellcounts.index[row], fontsize=12)
#plt.axis('equal')
plt.tight_layout()
plt.axis('off')
fig.legend(somatic_cellcounts.columns, ncol=1,loc='upper left', bbox_to_anchor=(1, 0.8), fontsize=10);
## GO term plot
sc.settings.set_figure_params(dpi=150, dpi_save=300, format="pdf", fontsize=8)
cols_clusters = ["gold","lightslategray", "lime", "#DAF7A6", "darkkhaki", "red", "#F16913", "#4C7C5E", "#2171B5","#eaa9bd", "#91357d"]
GO_terms = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/17Apr_rev/Macaque/GO_terms_selected_28Apr.csv")
GO_terms['CellType'] = GO_terms['CellType'].replace({
'Sperm': 'Immature Sperm'
})
GO_terms['-log10(p-value)'] = np.log10(GO_terms.p_value)*-1
plt.rcParams['xtick.labelsize']=6
plt.rcParams['ytick.labelsize']=6
plt.rcParams['axes.grid']=False
go = sns.FacetGrid(GO_terms, col="CellType", sharex=False, sharey=False,hue="CellType", col_wrap=4, aspect=2.2, size=1.1, palette=cols_clusters)#For 3 = aspect=2.2, size=0.75, for 4 = aspect=2.8, size=0.9
go.map(plt.barh, "name", "-log10(p-value)", height=0.6).set_titles("").set_axis_labels("","").set_titles("{col_name}")#.set_tickparams(axis="both", which="both", pad=0.01)#.set_xticklabels(length=0.1)#.set_titles("{col_name}").set_axis_labels("-log10(P)","")
go.fig.subplots_adjust(wspace=3.2, hspace=0.95)
mf_sce_gs.X = mf_sce_gs.layers["logcounts"]
# Load gene annnotation file
allgenes = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/gene_annotation.txt", index_col=0)
# Retrive chromosomes X, Y genes
chrX_genes = allgenes["Chromosome"]=="X"
chrY_genes = allgenes["Chromosome"]=="Y"
# Retrieve the X and Y genes detected in the dataset
Xgenes = set(allgenes[chrX_genes].index.tolist()).intersection(mf_sce_gs.var_names.tolist())
Ygenes = set(allgenes[chrY_genes].index.tolist()).intersection(mf_sce_gs.var_names.tolist())
mf_sce_gs.obs["ChrX percent"] = (np.sum(
mf_sce_gs[:, list(Xgenes)].X, axis=1).A1 / np.sum(mf_sce_gs.X, axis=1).A1)*100
mf_sce_gs.obs["ChrY percent"] = (np.sum(
mf_sce_gs[:, list(Ygenes)].X, axis=1).A1 / np.sum(mf_sce_gs.X, axis=1).A1)*100
# Make a dataframe using the selected columns
dist_data = mf_sce_gs.obs['broad_clusters'].to_frame().join(mf_sce_gs.obs['detected']).join(mf_sce_gs.obs['sum']).join(mf_sce_gs.obs['subsets_mt_percent']).join(mf_sce_gs.obs['ChrX percent']).join(mf_sce_gs.obs['ChrY percent'])
dist_data.columns = ['Clusters', 'No. of genes detected', 'UMI Count', '% Mito', '% ChrX', '% ChrY']
# Melt the data to long format
dist_data_melted = dist_data.melt(id_vars='Clusters', var_name='key', value_name='value')
sc.settings.set_figure_params(dpi=100, dpi_save=300, format="pdf")
# Set context to `"paper"`
sns.set_context("paper", font_scale=2, rc={"font.size":8,"axes.labelsize":8,"axes.titlesize":12})
plt.rcParams["axes.linewidth"] = 0.5
plt.rcParams["figure.titlesize"]=12
g = sns.FacetGrid(dist_data_melted, col="key", height=5, sharex=False, hue="Clusters", aspect=.8, palette=cols_clusters)
g.map(sns.violinplot, "value", "Clusters", label='xxlarge', linewidth=0.5).set_titles("{col_name}", size=18).set_axis_labels("","")
g.fig.tight_layout()
sc.settings.set_figure_params(dpi=100, dpi_save=300, format="pdf", frameon=True)
# Set context to `"paper"`
sns.set_context("paper", font_scale=1, rc={"font.size":8,"axes.labelsize":8,"axes.titlesize":12})
plt.rcParams["axes.linewidth"] = 1
%%R -o mf_sce_germSPG
## Germ SPG
mf_sce_germSPG <- mf_sce_gs[,mf_sce_gs$mf_leiden_clusters %in% c('43', '9', '4', '28', '2', '34', '35', '27')]
mf_sce_germSPG <- mf_sce_germSPG[Matrix::rowSums(counts(mf_sce_germSPG)) > 0,]
mf_sce_germSPG$mf_leiden_clusters <- as.character(mf_sce_germSPG$mf_leiden_clusters)
mf_sce_germSPG$mf_leiden_clusters <- factor(mf_sce_germSPG$mf_leiden_clusters)
#Identify HVGs based on sample
set.seed(55063)
mod_mf_batch_germ_SPG <- modelGeneVar(mf_sce_germSPG, block=colData(mf_sce_germSPG)$Sample)
hvgs_mf_batch_germ_SPG <- getTopHVGs(mod_mf_batch_germ_SPG, prop=0.1)
set.seed(55063)
mf_sce_germ_SPG_corrected <- fastMNN(mf_sce_germSPG, subset.row=hvgs_mf_batch_germ_SPG, batch=mf_sce_germSPG$Sample)
set.seed(55063)
mf_sce_germ_SPG_corrected <- runTSNE(mf_sce_germ_SPG_corrected, dimred="corrected",
verbose = F,
num_threads = 24)
reducedDim(mf_sce_germSPG, "mnn") <- reducedDim(mf_sce_germ_SPG_corrected, "corrected")
reducedDim(mf_sce_germSPG, "TSNE") <- reducedDim(mf_sce_germ_SPG_corrected, "TSNE")
reducedDim(mf_sce_germSPG, "PCA") <- reducedDim(mf_sce_germ_SPG_corrected, "mnn")
mf_sce_germSPG$mf_leiden_clusters <- as.character(mf_sce_germSPG$mf_leiden_clusters)
mf_sce_germSPG.var_names_make_unique()
mf_sce_germSPG.obs['Library'] = mf_sce_germSPG.obs['Sample']
mf_sce_germSPG.obs['batch'] = mf_sce_germSPG.obs['Library'].replace({
"MUX8555":"-0",
"MUX8990": "-1",
"MUX8991": "-2",
"MUX9780": "-3",
"SSEA4": "-4"
})
mf_sce_germSPG.obs['Sample'] = mf_sce_germSPG.obs['Library'].replace({
"MUX8555":"Adult1",
"MUX8990": "Infant",
"MUX8991": "Juvenile",
"MUX9780": "Adult2",
"SSEA4": "Adult2_SSEA4+"
})
mf_sce_germSPG.obs.index = mf_sce_germSPG.obs['Barcode']+mf_sce_germSPG.obs['batch']
mf_sce_germSPG.obs['Sample'].value_counts()
mf_sce_germSPG.obs['sum'].describe() #Descriptive statistics of UMI counts across spermatogonia cells
mf_sce_germSPG.obs['detected'].describe() #Descriptive statistics of genes detected across spermatogonia cells
mf_sce_germSPG.obsm["X_pca"] = mf_sce_germSPG.obsm["mnn"]
mf_sce_germSPG.X = mf_sce_germSPG.layers["logcounts"]
# Cell cycle scoring
sc.tl.score_genes_cell_cycle(mf_sce_germSPG,
s_genes=["MCM5", "PCNA", "TYMS", "FEN1", "MCM2", "MCM4", "RRM1", "UNG", "GINS2",
"MCM6", "CDCA7", "DTL", "ENSMFAG00000036445", "UHRF1", "CENPU", "HELLS",
"RFC2", "RPA2", "NASP", "RAD51AP1", "GMNN", "WDR76","SLBP", "CCNE2", "UBR7",
"POLD3", "MSH2", "ATAD2", "RAD51", "RRM2", "CDC45", "CDC6", "EXO1", "TIPIN",
"DSCC1", "BLM", "CASP8AP2", "USP1", "CLSPN", "POLA1", "CHAF1B", "BRIP1", "E2F8"],
g2m_genes=["HMGB2", "CDK1", "ENSMFAG00000039976", "UBE2C", "BIRC5", "TPX2", "TOP2A", "NDC80",
"ENSMFAG00000043640", "NUF2", "ENSMFAG00000002089", "MKI67", "TMPO", "CENPF", "TACC3",
"PIMREG", "SMC4", "CCNB2", "CKAP2L", "CKAP2", "AURKB", "BUB1", "KIF11", "ANP32E", "TUBB4B",
"GTSE1", "KIF20B", "HJURP", "CDCA3", "JPT1", "CDC20", "TTK", "CDC25C", "KIF2C", "RANGAP1",
"NCAPD2", "DLGAP5", "CDCA2", "CDCA8", "ECT2", "ENSMFAG00000041684", "HMMR", "AURKA", "PSRC1",
"ANLN", "LBR", "CKAP5", "CENPE", "CTCF", "NEK2", "G2E3", "GAS2L3", "CBX5", "CENPA"])
mf_sce_germSPG.obs["Sample"] = mf_sce_germSPG.obs["Sample"].astype("category").cat.reorder_categories(["Infant", "Juvenile", "Adult1", "Adult2", "Adult2_SSEA4+"],ordered=True)
sc.pl.tsne(mf_sce_germSPG, color=[ "Sample"], palette=["royalblue", "forestgreen", "red", "darkorange", "skyblue"],
size=40, edgecolor="black", linewidths=0.05, ncols=1, legend_fontsize=10, cmap="viridis", legend_fontoutline=1)
for s in ["Infant", "Juvenile", "Adult1", "Adult2", "Adult2_SSEA4+"]:
sc.pl.tsne(mf_sce_germSPG, color=[ "Sample"], palette=["royalblue", "forestgreen", "red", "darkorange", "skyblue"], groups=s,
size=40, edgecolor="black", linewidths=0.05, ncols=1, legend_fontsize=10, cmap="viridis", legend_fontoutline=1, title=s)
sc.pl.tsne(mf_sce_germSPG, color=[ "S_score", "G2M_score"],
size=40, edgecolor="black", linewidths=0.05, ncols=2, legend_fontsize=10, cmap="viridis", legend_fontoutline=1)
sc.pl.tsne(mf_sce_germSPG, color=["phase"], palette=["black", "lightgray", "red"],
size=40, edgecolor="black", linewidths=0.125, ncols=2, legend_fontsize=10, cmap=heatcolors, legend_fontoutline=1, title="")
sc.pl.tsne(mf_sce_germSPG, color=["ENSMFAG00000046145", "ENSMFAG00000037727", "PIWIL4", "ID4", "SOHLH1", "EGR4", "FGFR3", "NANOS3",
"UCHL1", "L1TD1", "ASB9", "DMRT1","DMRTB1", "GCNA", "STRA8","KIT","REC8","SYCP3", "PRDM9", "SPO11"],
legend_loc="on data", legend_fontsize=6, size=40, edgecolor="black", linewidths=0.1,wspace=0.1, hspace=0.2, cmap=heatcolors,
ncols=4, layer="logcounts")
sc.pp.neighbors(mf_sce_germSPG, n_neighbors = 10)
sc.tl.leiden(mf_sce_germSPG, resolution = 0.5) # Perform clustering
sc.pl.tsne(mf_sce_germSPG, color=["leiden"],legend_loc="on data", size=40, edgecolor="black", linewidths=0.1, ncols=3, legend_fontsize=8, alpha=0.6, legend_fontoutline=1)
spg_conditions = [
(mf_sce_germSPG.obs["leiden"].isin(["0"])),#Undiff1
(mf_sce_germSPG.obs["leiden"].isin(["2"])),#Undiff2
(mf_sce_germSPG.obs["leiden"].isin(["5"])),#Undiff3
(mf_sce_germSPG.obs["leiden"].isin(["3"])),#Ediff1
(mf_sce_germSPG.obs["leiden"].isin(["1"])),#Ediff2
(mf_sce_germSPG.obs["leiden"].isin(["4"])),#Ediff3
(mf_sce_germSPG.obs["leiden"].isin(["8"])),#Ediff4
(mf_sce_germSPG.obs["leiden"].isin(["7"])),#Mid diff
(mf_sce_germSPG.obs["leiden"].isin(["6"])),#Late diff
(mf_sce_germSPG.obs["leiden"].isin(["10"])),#pre-Lep
(mf_sce_germSPG.obs["leiden"].isin(["9"]))]#Lep
spg_groups = ["Undiff1", "Undiff2", "Undiff3", "E-diff1", "E-diff2", "E-diff3", "E-diff4", "Mid diff", "Late diff", "pre-Lep", "Lep"]
spg_clusters = np.select(spg_conditions, spg_groups, default="Germ")
mf_sce_germSPG.obs["Annotated"] = spg_clusters
mf_sce_germSPG.obs["Annotated"] = mf_sce_germSPG.obs["Annotated"].astype("category").cat.reorder_categories(["Undiff1", "Undiff2","Undiff3",
"E-diff1", "E-diff2", "E-diff3", "E-diff4",
"Mid diff", "Late diff", "pre-Lep", "Lep"],ordered=True)
spg_colors = ["#FFF5EB", "#FEE6CE", "#FDD0A2", "#FDAE6B", "#FD8D3C", "#F16913",
"#D94801", "#A63603", "#7F2704","#E4EDE4", "#BED1C2"]
sc.pl.tsne(mf_sce_germSPG, color=["Annotated"],palette=spg_colors,size=40, edgecolor="black",legend_loc="on data", legend_fontoutline=1,
linewidths=0.15, ncols=3, legend_fontsize=10, title="")
pd.crosstab(mf_sce_germSPG.obs['Annotated'], mf_sce_germSPG.obs['phase'])
pd.crosstab(mf_sce_germSPG.obs['Annotated'], mf_sce_germSPG.obs['Sample'])
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.facecolor'] = 'grey'
sc.pl.tracksplot(mf_sce_germSPG, var_names=["ENSMFAG00000046145", "ENSMFAG00000037727", "PIWIL4", "ID4", "SOHLH1", "EGR4", "FGFR3", "NANOS3", "UCHL1", "L1TD1", "ASB9",
"DMRT1","DMRTB1", "GCNA", "STRA8","KIT","REC8","SYCP3", "PRDM9", "SPO11"], groupby="Annotated", layer="logcounts");
plt.rcdefaults()
sc.settings.set_figure_params(dpi=100, dpi_save=300, format="pdf")#, vector_friendly=False)
# Set context to `"paper"`
sns.set_context("paper", font_scale=1, rc={"font.size":8,"axes.labelsize":8,"axes.titlesize":14})
plt.rcParams.update({"font.family":"Reem Kufi"})
sc.tl.paga(mf_sce_germSPG, groups="Annotated")
plt.rcParams['figure.figsize']=5,5
sc.pl.paga(mf_sce_germSPG, color="Annotated", threshold = 0.028, fontsize=10, fontoutline=1, node_size_power=1.25, node_size_scale=2.5, frameon=False)
sc.tl.rank_genes_groups(mf_sce_germSPG, "Annotated", n_genes=6000, method="wilcoxon", layer="logcounts", use_raw=False, rankby_abs=True)
spgAnnolist = {}
for i in mf_sce_germSPG.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(mf_sce_germSPG, group=i,pval_cutoff=0.05, log2fc_min=1.25, key="rank_genes_groups").sort_values(by="scores", ascending=False).dropna(axis=0)["names"].to_list()
spgAnnolist[i] = genes
pd.DataFrame.from_dict(spgAnnolist, orient="index").transpose().head(10)
SPG_L1 = []
for i in mf_sce_germSPG.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(mf_sce_germSPG, group=i,pval_cutoff=0.01, log2fc_min=1.25, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
SPG_L1.extend(genes)
len(SPG_L1)
SPG_unique_genes = list(unique_everseen(SPG_L1))
len(SPG_unique_genes)
mat_spg = sc.pl.matrixplot(mf_sce_germSPG, SPG_L1, groupby="Annotated", figsize=(4,4),standard_scale="var", linewidth=.0001,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)
plt.rcParams['figure.figsize']=4,0.75
sc.pl.dendrogram(mf_sce_germSPG, groupby="Annotated");
import scvelo as scv
print(scv.__version__)
MUX8555_loom = scv.read("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Macaque_Spermatogenesis/MacFas1_count/velocyto/MacFas1_count.loom", cache=True);
MUX8555_loom.var_names_make_unique()
MUX8990_loom = scv.read("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Macaque_Spermatogenesis/MacFas2_count/velocyto/MacFas2_count.loom", cache=True);
MUX8990_loom.var_names_make_unique()
MUX8991_loom = scv.read("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Macaque_Spermatogenesis/MacFas3_count/velocyto/MacFas3_count.loom", cache=True);
MUX8991_loom.var_names_make_unique()
MUX9780_loom = scv.read("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Macaque_Spermatogenesis/MacFas4_count/velocyto/MacFas4_count.loom", cache=True);
MUX9780_loom.var_names_make_unique()
SSEA4_loom = scv.read("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Macaque_Spermatogenesis/MacFas4_SSEA4_count/velocyto/MacFas4_SSEA4_count.loom", cache=True);
SSEA4_loom.var_names_make_unique()
mf_velodata = MUX8555_loom.concatenate(MUX8990_loom, MUX8991_loom, MUX9780_loom, SSEA4_loom)
# Rename the cell ids
batchid=[b.split(":")[1][16:] for b in mf_velodata.obs_names]
batch=np.array(batchid)
batch[np.isin(batchid,"x-0")]="-0"
batch[np.isin(batchid,"x-1")]="-1"
batch[np.isin(batchid,"x-2")]="-2"
batch[np.isin(batchid,"x-3")]="-3"
batch[np.isin(batchid,"x-4")]="-4"
cellid=[c.split(":")[1][:16] for c in mf_velodata.obs_names]
new_cellid=[s + "-1"+t for s,t in zip(cellid,batch)]
mf_velodata.obs_names=new_cellid
## Estimate RNA velocity for the Spermatogonia population
SPG=[s for s in mf_sce_germSPG.obs_names if s in mf_velodata.obs_names]
germSPG_velodata=mf_velodata[SPG].copy()
germSPG_velodata
germSPG_velodata.obsm["X_tsne"] = mf_sce_germSPG.obsm["X_tsne"]
germSPG_velodata.obs["Annotated"] = mf_sce_germSPG.obs["Annotated"]
germSPG_velodata.uns["Annotated_colors"] = mf_sce_germSPG.uns["Annotated_colors"]
scv.pp.filter_and_normalize(germSPG_velodata, min_shared_counts=20, n_top_genes=4000)
scv.pp.moments(germSPG_velodata, n_pcs=30, n_neighbors=30)
scv.tl.velocity(germSPG_velodata,mode="stochastic", vkey="stochastic_velocity")
scv.tl.velocity_graph(germSPG_velodata, vkey="stochastic_velocity")
scv.tl.velocity_embedding(germSPG_velodata, basis="tsne", vkey="stochastic_velocity")
scv.settings.set_figure_params(figsize=(6,6), dpi=100)
scv.pl.velocity_embedding_stream(germSPG_velodata, basis="tsne", color=["Annotated"],
vkey="stochastic_velocity", title="",min_mass=0,
legend_fontsize=14)
scv.pl.velocity_embedding(germSPG_velodata, basis="tsne", arrow_length=5, arrow_size=5, dpi=100, color="Annotated",
vkey="stochastic_velocity", title="",legend_loc="on data", legend_fontsize=10)
scv.tl.recover_dynamics(germSPG_velodata)
scv.tl.latent_time(germSPG_velodata)
mf_sce_germSPG.obs["latent_time"] = germSPG_velodata.obs["latent_time"]
mf_sce_germSPG.obs["velocity_pseudotime"] = germSPG_velodata.obs["velocity_pseudotime"]
mf_sce_germSPG.uns["velocity_graph"] = germSPG_velodata.uns["stochastic_velocity_graph"]
sc.settings.set_figure_params(dpi=100, dpi_save=300, format="pdf")
# Set context to `"paper"`
sns.set_context("paper", font_scale=1, rc={"font.size":8,"axes.labelsize":8,"axes.titlesize":14})
sc.pl.tsne(mf_sce_germSPG, color=["latent_time"], legend_loc="on data", ncols=1, legend_fontsize=6, size=40, edgecolor="black",
cmap="viridis", linewidths=0.1)
%%R -o mf_sce_germ
## GERM
mf_sce_germ <- mf_sce[,mf_sce$broadcluster=="Germ"]
mf_sce_germ <- mf_sce_germ[Matrix::rowSums(counts(mf_sce_germ)) > 0, ]
mf_sce_germ$mf_leiden_clusters <- factor(mf_sce_germ$mf_leiden_clusters)
# Identify HVGs based on sample
set.seed(55063)
mod_mf_batch_germ <- modelGeneVar(mf_sce_germ, block=colData(mf_sce_germ)$Sample)
hvgs_mf_batch_germ <- getTopHVGs(mod_mf_batch_germ, prop=0.1)
print(paste("HVGs used:", length(hvgs_mf_batch_germ)))
set.seed(55063)
mf_sce_germ_corrected <- fastMNN(mf_sce_germ, subset.row=hvgs_mf_batch_germ, batch=mf_sce_germ$Sample)
reducedDim(mf_sce_germ, "mnn") <- reducedDim(mf_sce_germ_corrected, "corrected")
mf_sce_germ$mf_leiden_clusters <- as.character(mf_sce_germ$mf_leiden_clusters)
mf_sce_germ.var_names_make_unique()
mf_sce_germ.obs['Library'] = mf_sce_germ.obs['Sample']
mf_sce_germ.obs['batch'] = mf_sce_germ.obs['Library'].replace({
"MUX8555":"-0",
"MUX8990": "-1",
"MUX8991": "-2",
"MUX9780": "-3",
"SSEA4": "-4"
})
mf_sce_germ.obs['Sample'] = mf_sce_germ.obs['Library'].replace({
"MUX8555":"Adult1",
"MUX8990": "Infant",
"MUX8991": "Juvenile",
"MUX9780": "Adult2",
"SSEA4": "Adult2_SSEA4+"
})
mf_sce_germ.obs.index = mf_sce_germ.obs['Barcode']+mf_sce_germ.obs['batch']
mf_sce_germ.obs['Sample'].value_counts()
mf_sce_germ.obs['sum'].describe() # UMI summary statistics
mf_sce_germ.obs['detected'].describe() # genes detected - summary statistics
mf_sce_germ.obs['Annotated'] = mf_sce_germ.obs['mf_leiden_clusters']
mf_sce_germ.obs['Annotated'] = mf_sce_germ.obs['Annotated'].astype(str)
for idx, x in mf_sce_germSPG.obs['Annotated'].iteritems():
mf_sce_germ.obs.at[idx, 'Annotated'] = x
mf_sce_germ.obsm['X_tsne'] = mf_sce_germ.obsm['mnnTSNE']
mf_sce_germ.obsm['X_pca'] = mf_sce_germ.obsm['mnn']
sc.pl.tsne(mf_sce_germ, color=["Annotated"], size=8, legend_loc="on data", edgecolor="black",
linewidths=0.1, ncols=3, legend_fontsize=8, alpha=0.6, legend_fontoutline=1)
mf_sce_germ.obs["Annotated"] = mf_sce_germ.obs["Annotated"].replace({
"36":"Lep",
"33": "Zyg",
"17": "Pach1",
"8": "Pach2",
"16": "Pach3",
"25": "Pach4",
"1": "Pach5",
"24": "Pach6",
"15": "Pach7",
"21": "Pach8",
"10": "Pach9",
"11": "Pach10",
"26": "Dip & Sec. Spcs",
"13": "RS1",
"12": "RS2",
"22": "RS3",
"20": "RS4",
"23": "RS5",
"32": "RS6",
"29": "RS7",
"38": "ES1",
"14": "ES2",
"18": "ES3",
"7": "ES4",
"30": "ES5",
"6": "ES6",
"19": "Imm. Sperm"
})
mf_sce_germ.obs["Annotated"] = mf_sce_germ.obs["Annotated"].astype("category").cat.reorder_categories(["Undiff1", "Undiff2", "Undiff3",
"E-diff1", "E-diff2","E-diff3", "E-diff4", "Mid diff", "Late diff",
"pre-Lep", "Lep", "Zyg",
"Pach1","Pach2","Pach3","Pach4","Pach5","Pach6","Pach7","Pach8","Pach9", "Pach10", "Dip & Sec. Spcs",
"RS1","RS2","RS3","RS4","RS5","RS6","RS7",
"ES1","ES2","ES3","ES4","ES5","ES6","Imm. Sperm"],ordered=True)
bc_cols = ["#FFF5EB", "#FEE6CE", "#FDD0A2", "#FDAE6B", "#FD8D3C", "#F16913", "#D94801", "#A63603", "#7F2704",
"#F7FCF5", "#E4EDE4", "#D1DFD3", "#BED1C2", "#ABC3B1", "#98B5A1", "#84A790", "#71987F", "#5F8A6E", "#4C7C5E", "#386E4D", "#25603C", "#12522B", "#00441B",
"#EFF3FF", "#C6DBEF", "#9ECAE1", "#6BAED6", "#4292C6", "#2171B5", "#084594",
"#f3cbd3", "#eaa9bd", "#dd88ac", "#ca699d", "#b14d8e", "#91357d", "#6c2167"]
sc.pl.tsne(mf_sce_germ, color=["Annotated"], size=10, palette=bc_cols,
edgecolor="black", linewidths=0.1, ncols=4, legend_fontsize=10, title="")
## Integrate all annotations on the whole data
mf_sce_gs.obs['Annotated'] = mf_sce_gs.obs['broad_clusters']
mf_sce_gs.obs['Annotated'] = mf_sce_gs.obs['Annotated'].astype('str')
for idx, x in mf_sce_germ.obs['Annotated'].iteritems():
mf_sce_gs.obs.at[idx, 'Annotated'] = x
mf_sce_germ.obs['broad_germ'] = None
for idx, x in mf_sce_gs.obs['broad_clusters'][mf_sce_gs.obs['broadcluster'].isin(['Germ'])].iteritems():
mf_sce_germ.obs.at[idx, 'broad_germ'] = x
mf_sce_germ.obs['broad_germ'] = mf_sce_germ.obs['broad_germ'].astype("category").cat.reorder_categories(["Spermatogonia", "Spermatocytes", "Round spermatids", "Elongating spermatids","Immature Sperm"], ordered=True)
mf_sce_gs.obs["Annotated"] = mf_sce_gs.obs["Annotated"].astype("category").cat.reorder_categories(["Myoid","Immature Leydig", "Leydig", "Sertoli", "Macrophage", "Endothelial", "Undiff1", "Undiff2", "Undiff3",
"E-diff1", "E-diff2","E-diff3","E-diff4", "Mid diff", "Late diff",
"pre-Lep", "Lep", "Zyg",
"Pach1","Pach2","Pach3","Pach4","Pach5","Pach6","Pach7","Pach8","Pach9", "Pach10", "Dip & Sec. Spcs",
"RS1","RS2","RS3","RS4","RS5","RS6","RS7",
"ES1","ES2","ES3","ES4","ES5","ES6","Imm. Sperm"],ordered=True)
palette = ["#FFB292","gold", "lightslategray", "#936C00","lime", "red",
"#FFF5EB", "#FEE6CE", "#FDD0A2", "#FDAE6B", "#FD8D3C", "#F16913", "#D94801", "#A63603", "#7F2704",
"#F7FCF5", "#E4EDE4", "#D1DFD3", "#BED1C2", "#ABC3B1", "#98B5A1", "#84A790", "#71987F", "#5F8A6E", "#4C7C5E", "#386E4D", "#25603C", "#12522B", "#00441B",
"#EFF3FF", "#C6DBEF", "#9ECAE1", "#6BAED6", "#4292C6", "#2171B5", "#084594",
"#f3cbd3", "#eaa9bd", "#dd88ac", "#ca699d", "#b14d8e", "#91357d", "#6c2167"]
sc.pl.tsne(mf_sce_gs, color=["Annotated"], size=10, palette=palette,
edgecolor="black", linewidths=0.05, legend_fontsize=10, title="")
sc.tl.rank_genes_groups(mf_sce_germ, "Annotated", n_genes = 6000, method = "wilcoxon", layer = "logcounts", use_raw = False)
germlist = {}
for i in mf_sce_germ.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(mf_sce_germ, group=i,pval_cutoff=0.01, log2fc_min=1.25,
key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
germlist[i] = genes
for key, value in germlist.items():
#print value
print(key, len([item for item in value if item]))
germ_L1 = []
for i in mf_sce_germ.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(mf_sce_germ, group=i,pval_cutoff=0.01, log2fc_min=1.25, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
germ_L1.extend(genes)
germ_unique_genes = list(unique_everseen(germ_L1))
len(germ_unique_genes)
ax, mat_germ = sc.pl.matrixplot(mf_sce_germ, germ_L1, groupby="Annotated", figsize=(10, 4),standard_scale="var", linewidth=.00001,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts",show=False, show_gene_labels=False)
mf_sce_germ.obsm["X_pca"] = mf_sce_germ.obsm["mnn"]
sc.pp.neighbors(mf_sce_germ, n_neighbors = 5) # Find the neighbours
sc.tl.paga(mf_sce_germ, groups="Annotated") # PAGA - Partition-based graph abstraction
sc.pl.paga(mf_sce_germ, color="Annotated", threshold=0.2848, fontsize=8, node_size_scale=1.8,
node_size_power=1, edge_width_scale=0.5, fontoutline=1, frameon=False)
mf_sce_germ.obs['germAD_clusters'] = mf_sce_germ.obs['Annotated']
mf_sce_germ.X = mf_sce_germ.layers["logcounts"]
# Load gene annnotation file
allgenes = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/gene_annotation.txt", index_col=0)
#chr_genes = {}
for ch in allgenes['Chromosome'].astype("category").cat.categories.tolist():
genes = allgenes["Chromosome"]==ch
Agenes = set(allgenes[genes].index.tolist()).intersection(mf_sce_germ.var_names.tolist())
mf_sce_germ.obs[ch] = np.ravel((mf_sce_germ[:, list(Agenes)].X != 0).sum(axis=1))
AllChr_df = pd.DataFrame()
for column in ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '3', '4', '5', '6', '7', '8', '9', 'MT', 'X', 'Y']:
x = mf_sce_germ.obs[column]
AllChr_df[column] = x
AllChr_df['germAD_clusters'] = mf_sce_germ.obs['germAD_clusters']
AllChr_df_melted = AllChr_df.melt(id_vars="germAD_clusters", var_name="key", value_name="value")
AllChr_df_melted_mean = AllChr_df_melted.groupby(["germAD_clusters", "key"], as_index=False).mean()
#Reshape the data
AllChr_df_mean_reshaped = AllChr_df_melted_mean.pivot(index="key",columns="germAD_clusters")
AllChr_df_melted_std = AllChr_df_melted.groupby(["germAD_clusters", "key"]).std().reset_index()
AllChr_df_melted_std_reshaped = AllChr_df_melted_std.pivot(index="key",columns="germAD_clusters")
plt.rcParams["axes.linewidth"] = 0.2
plt.rcParams["figure.figsize"]=7,3
cols = ["#87CEEB", "#83C8EA", "#7FC3E9", "#7BBEE9", "#78B8E8", "#74B3E8",
"#70AEE7", "#6DA8E7", "#69A3E6", "#659EE6", "#6298E5", "#5E93E5",
"#5A8EE4", "#5788E4", "#5383E3", "#4F7EE3", "#4C78E2", "#4873E2",
"#446EE1", "#4169E1", "darkorange"]
i=0
for c in ['1', '2','3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14',
'15', '16', '17', '18', '19', '20', 'Y']:
plt.plot(AllChr_df_mean_reshaped.iloc[0].index.get_level_values(1).tolist(),
AllChr_df_mean_reshaped.loc[c], linewidth=0.75, color=cols[i], label = c,
alpha=1)
i+=1
plt.plot(AllChr_df_mean_reshaped.iloc[0].index.get_level_values(1).tolist(),
AllChr_df_mean_reshaped.iloc[21], linewidth=0.75, marker='', color= "#C70039", label="X")
# Add legend
plt.legend(loc="center", ncol=2, fontsize=6, bbox_to_anchor=(1.06, 0.58))
# Add titles
#plt.title("Mean no. of genes", loc='left', fontsize=12, fontweight=2, color='black')
plt.ylabel("Mean no. of genes", fontsize=8)
plt.xlabel("")
plt.xticks(rotation=40, fontsize=8,horizontalalignment="right", rotation_mode="anchor")
plt.tick_params(axis="both", which="minor", left=False)
plt.tick_params(axis="both", which="major", bottom=True, top=False, left=True, labelbottom=True, pad=0.5, length=1.5, labelsize=8)
plt.yscale('log', basey=10)
plt.tight_layout()
plt.margins(0.003,0.003)
plt.grid(linestyle='-', linewidth='0.1')
# Retrive chromosomes X, Y, 9 and autosomal genes for X to A ratio
chrX_genes = allgenes["Chromosome"]=="X"
chrY_genes = allgenes["Chromosome"]=="Y"
chr9_genes = allgenes["Chromosome"]=="9"
chrA_genes = allgenes[~allgenes.Chromosome.isin(["X","Y", "MT"])]
# Retrieve the X and Y genes specifically present in the germ cells alone
germ_Xgenes = set(allgenes[chrX_genes].index.tolist()).intersection(mf_sce_germ.var_names.tolist())
germ_Ygenes = set(allgenes[chrY_genes].index.tolist()).intersection(mf_sce_germ.var_names.tolist())
germ_9genes = set(allgenes[chr9_genes].index.tolist()).intersection(mf_sce_germ.var_names.tolist())
germ_Agenes = set(chrA_genes["gene_short_name"].tolist()).intersection(mf_sce_germ.var_names.tolist())
# Calculate X to Autosome ratio, also Y to Autosome and as reference chr9 to Autosomes
mf_sce_germ.obs["Chr X"] = np.mean(
mf_sce_germ[:, list(germ_Xgenes)].X, axis=1).A1 / np.mean(mf_sce_germ[:, list(germ_Agenes)].X, axis=1).A1
mf_sce_germ.obs["Chr 9"] = np.mean(
mf_sce_germ[:, list(germ_9genes)].X, axis=1).A1 / np.mean(mf_sce_germ[:, list(germ_Agenes)].X, axis=1).A1
germXA_data = mf_sce_germ.obs["germAD_clusters"].to_frame().join(mf_sce_germ.obs["Chr 9"]).join(mf_sce_germ.obs["Chr X"])
#Melt the data to long format
germXA_data_melted = germXA_data.melt(id_vars="germAD_clusters", var_name="key", value_name="value")
## Test if the X:A ratio is significantly different between Late diff vs Zyg
from scipy import stats
stats.mannwhitneyu(germXA_data_melted[(germXA_data_melted['germAD_clusters'].isin(['Late diff'])) & (germXA_data_melted['key'].isin(['Chr X']))].value,
germXA_data_melted[(germXA_data_melted['germAD_clusters'].isin(['Zyg'])) & (germXA_data_melted['key'].isin(['Chr X']))].value)
## Plot the mean X:A ratio of each cell as violin
from matplotlib.ticker import FormatStrFormatter
plt.rcParams["figure.figsize"]=7,3
bx = sns.violinplot(x="germAD_clusters", y="value",
hue="key", palette=["#0D8DB3","#C70039"],fliersize=0.1,linewidth=0.5,split=True,scale="count",inner="quartile",
data=germXA_data_melted)
bx.set(xlabel="")
bx.set_ylabel("Chromosome:Autosome ratio", fontsize=8)
bx.set_xticklabels(bx.get_xticklabels(), rotation=40, fontsize=8,horizontalalignment="right", rotation_mode="anchor", visible=True)
bx.grid(linestyle='-', linewidth='0.1')
bx.tick_params(
axis="both", # changes apply to the x-axis
which="major", # both major and minor ticks are affected
bottom=True, # ticks along the bottom edge are off
top=False, left=True, labeltop=False, # ticks along the top edge are off
labelbottom=True, pad=0.5, length=1.5)
bx.axhline(y=0.5,color='gray',linestyle='--')
bx.axhline(y=1,color='gray',linestyle='--')
plt.legend(loc="upper right", fontsize=6)
plt.tight_layout()
sc.pp.neighbors(mf_sce_germ, n_pcs=50, knn=False, method="gauss")# Find the neighbours
sc.tl.diffmap(mf_sce_germ)
mf_sce_germ.uns["iroot"] = np.flatnonzero(mf_sce_germ.obs["germAD_clusters"] == "Undiff1")[0] # Set the root cell
sc.tl.dpt(mf_sce_germ)
sc.settings.set_figure_params(dpi=100, dpi_save=300, format="pdf")
sns.set_context("paper", font_scale=1, rc={"font.size":8,"axes.labelsize":12,"axes.titlesize":12})
sc.pl.tsne(mf_sce_germ, color=["dpt_pseudotime"], legend_fontsize=10, cmap= "viridis",
size=20, edgecolor="black", linewidths=0.05,title="") #Use gauss method to find neighbors to get proper pseudotime
# Load chr X gene annnotation file
Xgenes = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/28Feb/Macaque/Macaque_Xgenes_Annotated_updated.csv")
Xgenes.index = Xgenes['gene_short_name'].tolist()
germXsingle_sce = mf_sce_germ[:,mf_sce_germ.var['Symbol'][mf_sce_germ.var['Symbol'].isin(Xgenes['gene_short_name'][Xgenes['gene_class']=="SingleCopy"])].tolist()]
germXMulti_sce = mf_sce_germ[:,mf_sce_germ.var['Symbol'][mf_sce_germ.var['Symbol'].isin(Xgenes['gene_short_name'][Xgenes['gene_class']=="MultiCopy"])].tolist()]
germXAmpliconic_sce = mf_sce_germ[:,mf_sce_germ.var['Symbol'][mf_sce_germ.var['Symbol'].isin(Xgenes['gene_short_name'][Xgenes['gene_class']=="Ampliconic"])].tolist()]
germY_sce = mf_sce_germ[:,list(germ_Ygenes)]
sc.tl.rank_genes_groups(germXsingle_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
sc.tl.rank_genes_groups(germXMulti_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
sc.tl.rank_genes_groups(germXAmpliconic_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
sc.tl.rank_genes_groups(germY_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
singleX_L1 = []
for i in germXsingle_sce.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(germXsingle_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
singleX_L1.extend(genes)
MultiX_L1 = []
for i in germXMulti_sce.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(germXMulti_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
MultiX_L1.extend(genes)
Ampli_L1 = []
for i in germXAmpliconic_sce.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(germXAmpliconic_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
Ampli_L1.extend(genes)
singleY_L1 = []
for i in germY_sce.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(germY_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
singleY_L1.extend(genes)
singleX_unique = pd.Series(singleX_L1).drop_duplicates().tolist()
MultiX_unique = pd.Series(MultiX_L1).drop_duplicates().tolist()
Ampli_unique = pd.Series(Ampli_L1).drop_duplicates().tolist()
singleY_unique = pd.Series(singleY_L1).drop_duplicates().tolist()
mf_sce_germ.uns['singleX_unique'] = singleX_unique
mf_sce_germ.uns['MultiX_unique'] = MultiX_unique
mf_sce_germ.uns['Ampli_unique'] = Ampli_unique
mf_sce_germ.uns['singleY_unique'] = singleY_unique
print("No. of single X copy genes: %d" %len(singleX_unique))
print("No. of multi X copy genes: %d" %len(MultiX_unique))
print("No. of ampliconic X genes: %d" %len(Ampli_unique))
print("No. of Y genes: %d" %len(singleY_unique))
XYgenes = {'X Single-copy':singleX_unique, 'X Multi-copy':MultiX_unique, 'X Ampliconic':Ampli_unique, 'Y':singleY_unique}
X_sma = {'X Single-copy':singleX_unique, 'X Multi-copy':MultiX_unique, 'X Ampliconic':Ampli_unique}
ax = sc.pl.matrixplot(mf_sce_germ, X_sma, groupby="Annotated", figsize=(8,8),standard_scale="var", linewidth=.01,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)
ax = sc.pl.matrixplot(mf_sce_germ, {'Multi':MultiX_unique,'Ampli':Ampli_unique}, groupby="Annotated", figsize=(8,7),standard_scale="var", linewidth=.01,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=True)
Xgenes_mf = Xgenes[Xgenes['gene_short_name'].isin(mf_sce_germ.var['Symbol'])]
Xgenes_mf['gene_class'].value_counts()
pd.crosstab(Xgenes_mf['gene_class'], Xgenes_mf['Shared'])
MultiX_unique_shared = {}
for i in Xgenes_mf['Shared'].astype('category').cat.categories:
genes = [x for x in MultiX_unique if x in Xgenes_mf['gene_short_name'][(Xgenes_mf['gene_class']=='MultiCopy') & (Xgenes_mf['Shared']==i)].tolist()]
MultiX_unique_shared[i] = genes
ax = sc.pl.matrixplot(mf_sce_germ, MultiX_unique_shared, groupby="Annotated", figsize=(8,7),standard_scale="var", linewidth=.1,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=True)
AmpliX_unique_shared = {}
for i in Xgenes_mf['Shared'].astype('category').cat.categories:
genes = [x for x in Ampli_unique if x in Xgenes_mf['gene_short_name'][(Xgenes_mf['gene_class']=='Ampliconic') & (Xgenes_mf['Shared']==i)].tolist()]
AmpliX_unique_shared[i] = genes
ax = sc.pl.matrixplot(mf_sce_germ, AmpliX_unique_shared, groupby="Annotated", figsize=(8,7),standard_scale="var", linewidth=.1,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=True)
sc.pl.dotplot(mf_sce_germ,mf_sce_germ.uns['singleY_unique'], groupby="Annotated", figsize=(8,6), standard_scale="var",color_map=heatcolors_wr,
dendrogram=False, layer="logcounts")
mf_sce_germ.obs['ForPie'] = mf_sce_germ.obs['Annotated']
mf_sce_germ.obs['ForPie'] = mf_sce_germ.obs['ForPie'].replace({
"Mid diff":"Mid-diff",
"Late diff":"Late-diff",
"Dip & Sec. Spcs":"Dip-SecSpcs"
})
X_pie_df = pd.DataFrame(columns=["Unexpressed", "Single-copy", "Multicopy", "Ampliconic"], index=['Undiff1', 'Undiff2', 'Undiff3', 'E-diff1', 'E-diff2', 'E-diff3',
'E-diff4', 'Mid-diff', 'Late-diff', 'pre-Lep', 'Lep', 'Zyg', 'Pach1',
'Pach2', 'Pach3', 'Pach4', 'Pach5', 'Pach6', 'Pach7', 'Pach8', 'Pach9',
'Pach10', 'Dip-SecSpcs', 'RS1', 'RS2', 'RS3', 'RS4', 'RS5', 'RS6',
'RS7', 'ES1', 'ES2', 'ES3', 'ES4', 'ES5', 'ES6', 'Imm. Sperm'])
for i in mf_sce_germ.obs['ForPie'].astype('category').cat.categories.tolist():
df = i+"_df"
df = mf_sce_germ[mf_sce_germ.obs['ForPie'].isin([i]),:]
df = df[:,list(germ_Xgenes)]
sc.pp.filter_genes(df, min_cells=round((df.n_obs*0.05)));
X_pie_df.loc[i] = [934-df.n_vars, len(df.var['Symbol'][df.var['Symbol'].isin(singleX_unique)]), len(df.var['Symbol'][df.var['Symbol'].isin(MultiX_unique)]), len(df.var['Symbol'][df.var['Symbol'].isin(Ampli_unique)])]
X_pie_df = X_pie_df.div(X_pie_df.sum(axis=1), axis=0)
fig, axs = plt.subplots(ncols=37, nrows=1, figsize=(20,3))
fig.subplots_adjust(hspace=0.5, wspace=0.05)
for row, ax in enumerate(axs.flatten()):
ax.pie(X_pie_df.loc[X_pie_df.index[row],:],radius = 1, labels=None, colors=["darkgrey", "midnightblue", "darkorange", "forestgreen"])
ax.set_title(X_pie_df.index[row], fontsize=10, rotation=40, horizontalalignment='left')
fig.legend(X_pie_df.columns, ncol=1,loc="upper right", bbox_to_anchor=(0.5, 0.4), fontsize=10)
plt.show()
sc.settings.set_figure_params(dpi=100, dpi_save=300, format="pdf")
sns.set_context("paper", font_scale=1, rc={"font.size":6,"axes.labelsize":12,"axes.titlesize":12})
## Used the term "DNA damage response" in Panther database to retrive DDR genes
sub1 = ['BRCA2', 'MSH3', 'PMS1', 'BAX', 'RPA2', 'XPA', 'TP53BP1', 'MSH2', 'RAD52', 'FANCC', 'XRCC3',
'ENSMFAG00000000382', 'BRCA1', 'RECQL4', 'XRCC4', 'RECQL', 'PARP2',
'ENSMFAG00000039395', 'RAD51C', 'ENSMFAG00000036138', 'ATM', 'RECQL5', 'ENSMFAG00000033904',
'RAD21L1', 'NIPBL', 'ATR', 'RAD9B', 'MLH3', 'MSH4', 'MSH6', 'HUS1B', 'RAD17', 'LIG4', 'RAD1', 'XRCC6', 'FANCG', 'RAD51',
'XRCC5', 'MLH1', 'RAD9A', 'MDC1', 'MRE11', 'SPO11', 'CBX1', 'H2AFX', 'HORMAD1', 'HORMAD2', 'SETDB1', 'TOPBP1', 'TRIM28']
DDR_sub_unique_sce = mf_sce_germ[:,sub1]
sc.tl.rank_genes_groups(DDR_sub_unique_sce, "Annotated", n_genes=210, method="wilcoxon", layer="logcounts", use_raw=False)
L1 = []
for i in DDR_sub_unique_sce.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(DDR_sub_unique_sce, group=i,log2fc_min=1,pval_cutoff=1, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
L1.extend(genes)
sub1_unique = pd.Series(L1).drop_duplicates().tolist()
len(sub1_unique)
ax = sc.pl.matrixplot(mf_sce_germ,sub1_unique, groupby="Annotated", figsize=(7,5),standard_scale="var", linewidth=.1,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=True)
mf_unique = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/17Apr_rev/Mf_unique.txt")
Orthologs = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/Human_Mouse_Monkey_orthologs_21Feb_clean.txt", sep=" ")
Orthologs = Orthologs[(Orthologs.Crab_eating_macaque_homology_type=="ortholog_one2one") & (Orthologs.Mouse_homology_type=="ortholog_one2one")]
mf_ortho_df = pd.DataFrame(columns=["1:1:1 Othologs", "Macaque_specific", "Other orthologs"])
for i in mf_sce_germ.obs['Annotated'].astype('category').cat.categories.tolist():
df = i+"_df"
df = mf_sce_germ[mf_sce_germ.obs['Annotated'].isin([i]),:];
#df = df[:,list(germ_Xgenes)]
sc.pp.filter_genes(df, min_cells=round((df.n_obs*0.05)));
mf_ortho_df.loc[i] = [len(df.var['ID'][df.var['ID'].isin(Orthologs.Crab_eating_macaque_gene_stable_ID.tolist())])/len(df.var['ID'])*100,
len(df.var['ID'][df.var['ID'].isin(mf_unique['x'].tolist())])/len(df.var['ID'])*100,
len(df.var['ID'][~df.var['ID'].isin(Orthologs.Crab_eating_macaque_gene_stable_ID.tolist()+mf_unique['x'].tolist())])/len(df.var['ID'])*100]
mf_ortho_df['Groups'] = mf_ortho_df.index
mf_ortho_df_melted = mf_ortho_df.melt('Groups', var_name='Type', value_name='value')
g=sns.catplot(x="Groups", y="value", hue='Type', data=mf_ortho_df_melted, kind="point",
palette = ["#DF8F44FF", "#4B706B", "#80796BFF"],s=0.1, height=5, aspect=1.3, scale = 0.6)
plt.xticks(rotation=40, fontsize=8,horizontalalignment="right", rotation_mode="anchor");
plt.xlabel("");
plt.ylabel("% Genes expressed", fontsize=10);